import os
import glob as glob
import pdb
import json

# Module-level vocabulary mapping each pinyin token to its integer id.
# It is (re)populated in place by pinyin_all().
pinyin_to_id = {}


def pinyin_all(
    in_dir: str = '/opt/tiger/arnold_test/workstation/data/speech/data_thchs30/',
    pinyin_to_id_json_file: str = 'pinyin_to_id_dict.json',
) -> None:
    """Build the pinyin-to-id vocabulary and write it to a JSON file.

    Every ``*.trn`` file directly under ``<in_dir>/data`` is opened as UTF-8
    and its second line is split on whitespace; that line is taken to be the
    pinyin transcription. Each distinct token receives the next free id,
    starting at 1, in order of first appearance. Files are visited in sorted
    path order so the assignment is reproducible. After all files are
    processed, ``'UNK'`` and ``'EOS'`` are given the next two ids.

    The module-level ``pinyin_to_id`` dict is cleared and refilled in place,
    so calling this function repeatedly yields the same mapping instead of
    restarting the counter on top of stale entries.

    Args:
        in_dir: Directory containing the ``data`` sub-directory of ``.trn``
            files.
        pinyin_to_id_json_file: Path of the JSON file to write.

    Raises:
        OSError: If a transcript cannot be read or the output file cannot be
            written.
    """
    # Start from a clean slate; mutate in place so that existing references
    # to the module-level dict keep seeing the current contents.
    pinyin_to_id.clear()
    next_id = 1

    # glob returns paths in arbitrary order; sort for stable id assignment.
    trn_files = sorted(glob.glob(os.path.join(in_dir, 'data', '*.trn')))
    for trn in trn_files:
        with open(trn, encoding='utf-8') as f:
            f.readline()  # first line is not used
            pinyin_line = f.readline()

        # split() with no argument drops the trailing newline (and any '\r'),
        # and never yields empty tokens for repeated spaces or a missing line.
        for pinyin in pinyin_line.split():
            if pinyin not in pinyin_to_id:
                pinyin_to_id[pinyin] = next_id
                next_id += 1

    # Special symbols take the two ids following the last real token.
    pinyin_to_id['UNK'] = next_id
    next_id += 1
    pinyin_to_id['EOS'] = next_id

    # Write once, after the vocabulary is complete.
    with open(pinyin_to_id_json_file, 'w', encoding='utf-8') as f:
        json.dump(pinyin_to_id, f)

# When run as a script, build the pinyin-to-id mapping and write it to disk.
if __name__ == "__main__":
    pinyin_all()
